Train 80 %-> real train 75% ,weight train 5% (weight from t-1)

test 20%

Code
library(tidyverse)
library(janitor)
library(ggplot2)
library(dplyr)
library(rugarch)
library(gganimate)
Code
stock <- read.csv("data/individual_book_train/stock_100.csv")
Code
stock <- stock %>% mutate(
  WAP = (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)
  )

stock <- stock %>% mutate(
  BidAskSpread = ask_price1 / bid_price1 - 1
  )

stock <- stock %>% mutate(
  imbalance = abs((bid_size1 - ask_size1) / (bid_size1 + ask_size1))
  )


log_rs <- list()
imba_mean <- vector()
BAS_mean <- vector()

#time_IDs <- unique(stock$time_id)
time_IDs <- unique(stock[, 1])[1:20]


for (i in 1 : length(time_IDs)) {
  sec <- stock %>% filter(time_id == time_IDs[i]) %>% pull(seconds_in_bucket)
  price <- stock %>% filter(time_id == time_IDs[i]) %>% pull(WAP)
  imbad <- stock %>% filter(time_id == time_IDs[i]) %>% pull(imbalance)
  BASD <- stock %>% filter(time_id == time_IDs[i]) %>% pull(BidAskSpread)
  
  imba_mean[[i]] <- mean(imbad) 
  BAS_mean[[i]] <- mean(BASD)
  
  log_r <- log(price[-1] / price[1:(length(price) - 1)])
  log_rs[[i]] <- data.frame(time = sec[-1], log_return = log_r)
  time.no.change <- (1:600)[!(1:600 %in% log_rs[[i]]$time)]
  if (length(time.no.change) > 0) {
    new.df <- data.frame(time = time.no.change, log_return = 0)
    log_rs[[i]] <- rbind(log_rs[[i]], new.df)
    log_rs[[i]] <- log_rs[[i]][order(log_rs[[i]]$time), ]
  }
}

vol <- list()
comp_vol <- function(x) {
  return(sqrt(sum(x ^ 2)))
}
for (i in 1 : length(log_rs)) {
  log_rs[[i]] <- log_rs[[i]] %>% mutate(time_bucket = ceiling(time / 30))
  vol[[i]] <- aggregate(log_return ~ time_bucket, data = log_rs[[i]], FUN = comp_vol)
  colnames(vol[[i]]) <- c('time_bucket', 'volatility')
}

Cluster ?

Code
#put into cluster base on Plot ***Eye balling***
cluster_l <- vector()
for (i in 1:length(vol)) {
  if (BAS_mean[[i]] > 0.15){cluster_l <- 4}
  else if (imba_mean[[i]] > 0.61) {cluster_l[[i]] <- 3}
  else if (imba_mean[[i]] < 0.45) {cluster_l[[i]] <- 2}
  else {cluster_l[[i]] <- 1}
}

cluster_l
 [1] 3 3 2 1 3 1 2 1 2 2 1 1 1 3 2 1 3 2 2 2

model

egarch

Code
spec <- ugarchspec(variance.model = list(model = "eGARCH", garchOrder = c(1, 1)), 
                   mean.model = list(armaOrder = c(0, 0)), 
                   distribution.model = "norm")
ARMA_GARCH.models <- list()

# filter time 450 for first 75% train
for (i in 1 : length(vol)) {
  ARMA_GARCH.models[[i]] <- ugarchfit(spec = spec, data = log_rs[[i]] %>% 
                                        filter(time <= 450) %>% pull(log_return),
                                      solver = 'hybrid')
}

# 30weight 120 real predict
n_w = 30
n_p = 120

garch_weight <- vector()
pred1 <- list()
pred1_adjust <- list()
for (i in 1 : length(vol)) {

  fitted <- rep(1,n_p)
  pred1[[i]] <- data_frame(fitted)
  fitted <- rep(1:4)
  pred1_adjust[[i]] <- data_frame(fitted)
  
  fspec <- getspec(ARMA_GARCH.models[[i]])
  setfixed(fspec) <- as.list(coef(ARMA_GARCH.models[[i]]))
  future.path <- fitted(ugarchpath(fspec, n.sim = 150, m.sim = 1000))
  future.path[is.na(future.path)] <- 0
  
  garch_weight[i] <- mean(sqrt(colSums(future.path[1:30,]^2)))
  pred1_adjust[[i]]$fitted[1] <- mean(sqrt(colSums(future.path[31:60,]^2)))
  pred1_adjust[[i]]$fitted[2] <- mean(sqrt(colSums(future.path[61:90,]^2)))
  pred1_adjust[[i]]$fitted[3] <- mean(sqrt(colSums(future.path[91:120,]^2)))
  pred1_adjust[[i]]$fitted[4] <- mean(sqrt(colSums(future.path[121:150,]^2)))

}

wlr

Code
vol.train <- list()
vol.val <- list()
vol.w <- list()

for (i in 1 : length(log_rs)) {
  vol.train[[i]] <- vol[[i]][1:15, ]
  vol.val[[i]] <- vol[[i]][-(1:15), ]
}

list.reg <- list()
stocklm <- stock %>% mutate(time_bucket = ceiling(seconds_in_bucket / 30),
                            num_order = bid_size1 + ask_size1 + bid_size2 + ask_size2)
len.train <- length(vol.train[[1]]$volatility)

for (i in 1 : length(vol)) {
  stats.bucket <- stocklm %>% 
    filter(time_id == time_IDs[i] & time_bucket != 0) %>% 
    select(c(BidAskSpread, WAP, num_order, time_bucket)) 
  mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket, FUN = mean)
  list.reg[[i]] <- data.frame(volatility = vol.train[[i]]$volatility[-1], 
                              price = mean.price$WAP[1:(len.train - 1)],
                              order = mean.order$num_order[1:(len.train - 1)],
                              BidAskSpread = mean.BAS$BidAskSpread[1:(len.train - 1)])
}

lm.models <- list()

for (i in 1 : length(vol)) {
  lm.models[[i]] <- lm(volatility ~ price + order + BidAskSpread, list.reg[[i]],
                       weights = 0.8 ^ (((len.train - 2):0) / 2))
}
Code
list.reg.val <- list()
len.val <- length(vol.val[[1]]$volatility)
pred.lm <- list()

for (i in 1 : length(vol)) {
  stats.bucket <- stocklm %>% 
    filter(time_id == time_IDs[i] & time_bucket != 0) %>% 
    select(c(BidAskSpread, WAP, num_order, time_bucket))
  mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket, FUN = mean)
  list.reg.val[[i]] <- 
    data.frame(volatility = vol.val[[i]]$volatility, 
               price = mean.price$WAP[len.train:(len.train + len.val - 1)],
               order = mean.order$num_order[len.train:(len.train + len.val - 1)],
               BidAskSpread = mean.BAS$BidAskSpread[len.train:(len.train + len.val - 1)])
  pred.lm[[i]] <- predict(lm.models[[i]], newdata = list.reg.val[[i]])
}
pred2 <- pred.lm

hav

Code
list.HAV <- list()

for (i in 1 : length(vol)) {
  mean.vol <- rep(0, len.train - 5)
  for (j in 1 : 5) {
    mean.vol <- mean.vol + vol.train[[i]]$volatility[j : (j + len.train - 6)] / 5
  }
  list.HAV[[i]] <- data.frame(vol = vol.train[[i]]$volatility[-(1:5)], 
                              vol_1 = vol.train[[i]]$volatility[5:(len.train - 1)],
                              mean_vol_5 = mean.vol)
}


quar <- list()
comp_quar <- function(x) {
  return(length(x) / 3 * sum(x ^ 4))
}
for (i in 1 : length(log_rs)) {
  quar[[i]] <- aggregate(log_return ~ time_bucket, data = log_rs[[i]], FUN = comp_quar)
  colnames(quar[[i]]) <- c('time_bucket', 'quarticity')
}

HAV.wls.models <- list()

for (i in 1 : length(vol)) {
  HAV.wls.models[[i]] <- lm(vol ~ vol_1 + mean_vol_5, list.HAV[[i]],
                            weights = list.HAV[[i]]$vol_1 / 
                              sqrt(quar[[i]]$quarticity[5:(len.train - 1)]))
}
Code
pred.hav.all <- list()
for (j in 1:1) {
  pred.hav <- list()
  latest_obs <- list()
  list_HAV1_cluster <- list()
  for (i in 1:length(vol)) {
    # This will predict 16, 17, 18, 19, 20
    latest_obs[[i]] <- vol.train[[i]]$volatility[11:15]
    for (t in 1:5) {
      # Compute mean volatility for the last 5 observations
      mean.vol <- sum(latest_obs[[i]])/5
      # Create data frame with updated vol_1 and mean_vol_5
      list_HAV1_cluster[[i]] <- data.frame(
        vol_1 = latest_obs[[i]][5],
        mean_vol_5 = mean.vol
      )
      pred.hav[[t]] <- unname(predict(HAV.wls.models[[i]], newdata = list_HAV1_cluster[[i]]))
      # Drop the oldest observation and add new predicted value 
      latest_obs[[i]] <- c(latest_obs[[i]][-1], pred.hav[[t]])
   }
   #cluster_pred_lm[[j]][[i]] <- latest_obs
 }
 pred.hav.all[[j]] <- latest_obs
}

#pred.hav.all[[1]][[1]][[1]]
pred3 <- list()
for (i in 1:length(vol)){
  pred3[[i]] <- pred.hav.all[[1]][[i]]
}

mix

cluster 1,3 = HAV + WLR

cluster 2,4 = EGARCH+ WLR

Code
mix <- list()

for(i in 1:length(vol)){
  pred_f <- rep(1,4)
  mod_a <- rep(1,4)
  mod_b <- rep(1,4)
  alpha_w <- rep(1,4)
  beta_w <- rep(1,4)
  val <- rep(1,4)
  time <- c(17,18,19,20)
  
  mix[[i]] <- data.frame(time,pred_f,mod_a,mod_b,alpha_w,beta_w,val)
  
  ###val
  mix[[i]]$val <- vol.val[[i]]$volatility[2:5]
  
  
  if(cluster_l[[i]] == 1 | cluster_l[[i]] == 3){
    mix[[i]]$mod_a <- pred3[[i]][2:5]
    mix[[i]]$mod_b <- c(pred2[[i]][[2]],pred2[[i]][[3]],pred2[[i]][[4]],pred2[[i]][[5]])
    
    pa <- garch_weight[[i]]
    pb <- pred2[[i]][[1]]
  }
  
  
  else {
    mix[[i]]$mod_a <- pred1_adjust[[i]]$fitted
    mix[[i]]$mod_b <- c(pred2[[i]][[2]],pred2[[i]][[3]],pred2[[i]][[4]],pred2[[i]][[5]])
    pa <- garch_weight[[i]]
    pb <- pred2[[i]][[1]]
  }
  
 
  #16 -> 17
  a = 0
  b = 1
  sm_err = 99999
  best_a = 0
  for(w in 1:11){
    m_cal <- a*pa + b*pb
    ab_err <- abs(m_cal - vol.val[[i]]$volatility[[1]])
    if(ab_err < sm_err){
      sm_err <- ab_err
      best_a <- a
    }
    a <- a+0.1
    b <- b-0.1
  }
  mix[[i]]$alpha_w[[1]] <- best_a
  mix[[i]]$beta_w[[1]] <- round(1-best_a,digit = 1)
  
  #17-19 -> 18-20 alpha
  for(j in 1:3){
    a = 0
    b = 1
    sm_err = 99999
    best_a = 0
    for(w in 1:11){
      m_cal <- a*mix[[i]]$mod_a[[j]] + b*mix[[i]]$mod_b[[j]]
      ab_err <- abs(m_cal - vol.val[[i]]$volatility[[j+1]])
      if(ab_err < sm_err){
        sm_err <- ab_err
        best_a <- a
      }
      a <- a+0.1
      b <- b-0.1
    }
    mix[[i]]$alpha_w[[j+1]] <- best_a
    mix[[i]]$beta_w[[j+1]] <- round(1-best_a,digit = 1)
  }
  ###mix
  mix[[i]]$pred_f <- ((mix[[i]]$mod_a*mix[[i]]$alpha_w) + (mix[[i]]$mod_b*mix[[i]]$beta_w))
  
}

result

value table

Code
mix
[[1]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0010260675 0.0010260675 0.0012196350       1      0 0.0015946080
2   18 0.0009863879 0.0010015944 0.0009863879       0      1 0.0007010344
3   19 0.0007984031 0.0009732926 0.0007984031       0      1 0.0007203055
4   20 0.0011301855 0.0010269748 0.0011301855       0      1 0.0008264389

[[2]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 3.861298e-05 0.0005286728 3.861298e-05     0.0    1.0 0.0006225340
2   18 5.574654e-04 0.0005574654 2.535723e-04     1.0    0.0 0.0003323998
3   19 3.434675e-04 0.0005524532 2.539023e-04     0.3    0.7 0.0001328152
4   20 1.260451e-04 0.0005043462 1.260451e-04     0.0    1.0 0.0001088957

[[3]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0011106308 0.0008619094 0.0011106308       0      1 0.0011320570
2   18 0.0009069263 0.0008653487 0.0009069263       0      1 0.0010788897
3   19 0.0010573743 0.0008720265 0.0010573743       0      1 0.0007937135
4   20 0.0008682097 0.0008682097 0.0010463319       1      0 0.0007345000

[[4]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0006207086 0.0006141766 0.0006207086       0      1 0.0004778847
2   18 0.0006178964 0.0006178964 0.0005804300       1      0 0.0005119734
3   19 0.0005614391 0.0006136615 0.0005614391       0      1 0.0007271090
4   20 0.0006155874 0.0006155874 0.0006149317       1      0 0.0001894792

[[5]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0004222089 0.0002965554 0.0004222089       0      1 0.0001584394
2   18 0.0005178913 0.0005178913 0.0003642260       1      0 0.0007691548
3   19 0.0004737087 0.0004737087 0.0004001583       1      0 0.0006672344
4   20 0.0005147503 0.0005147503 0.0003852155       1      0 0.0008105287

[[6]]
  time       pred_f       mod_a        mod_b alpha_w beta_w          val
1   17 0.0018954948 0.001254510 0.0018954948     0.0    1.0 0.0013235207
2   18 0.0012837161 0.001294379 0.0011877482     0.9    0.1 0.0009859261
3   19 0.0012316830 0.001317241 0.0012316830     0.0    1.0 0.0011174693
4   20 0.0007417947 0.001332193 0.0007417947     0.0    1.0 0.0015402672

[[7]]
  time      pred_f       mod_a       mod_b alpha_w beta_w         val
1   17 0.002108687 0.002180509 0.002108687       0      1 0.002893711
2   18 0.002167896 0.002167896 0.001994708       1      0 0.002303492
3   19 0.002166226 0.002166226 0.002135026       1      0 0.001550287
4   20 0.002338768 0.002182540 0.002338768       0      1 0.001924196

[[8]]
  time      pred_f       mod_a       mod_b alpha_w beta_w          val
1   17 0.001239076 0.001239076 0.001543944       1      0 0.0009875863
2   18 0.001188923 0.001188923 0.001495689       1      0 0.0012000349
3   19 0.001166031 0.001166031 0.001443581       1      0 0.0015288547
4   20 0.001388200 0.001220593 0.001388200       0      1 0.0013299081

[[9]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0006487408 0.0008120296 0.0006079186     0.2    0.8 0.0005099655
2   18 0.0006925670 0.0008076621 0.0006925670     0.0    1.0 0.0010484798
3   19 0.0008036442 0.0008036442 0.0007425936     1.0    0.0 0.0008481560
4   20 0.0008098678 0.0008098678 0.0006439829     1.0    0.0 0.0005676446

[[10]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0007456589 0.0008604422 0.0006691367     0.4    0.6 0.0004338248
2   18 0.0004130729 0.0008751826 0.0004130729     0.0    1.0 0.0006972630
3   19 0.0006782799 0.0008679354 0.0003937966     0.6    0.4 0.0004923812
4   20 0.0008804542 0.0008629057 0.0008848413     0.2    0.8 0.0006493527

[[11]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0004542000 0.0007850124 0.0004542000       0      1 0.0012770825
2   18 0.0008752055 0.0008752055 0.0010498573       1      0 0.0013026599
3   19 0.0011927043 0.0009734260 0.0011927043       0      1 0.0008389981
4   20 0.0009864821 0.0009864821 0.0007859345       1      0 0.0002842181

[[12]]
  time      pred_f       mod_a       mod_b alpha_w beta_w         val
1   17 0.001854646 0.001854646 0.002346960     1.0    0.0 0.001265334
2   18 0.002406117 0.002406117 0.001859706     1.0    0.0 0.002095780
3   19 0.002634714 0.003425367 0.002107611     0.4    0.6 0.003082850
4   20 0.002404767 0.002400060 0.002415752     0.7    0.3 0.001890465

[[13]]
  time      pred_f       mod_a       mod_b alpha_w beta_w          val
1   17 0.001214288 0.001214288 0.001226026       1      0 0.0013525503
2   18 0.001372639 0.001239580 0.001372639       0      1 0.0008768654
3   19 0.001215211 0.001215211 0.001270454       1      0 0.0012045310
4   20 0.001047987 0.001047987 0.001075509       1      0 0.0007419980

[[14]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0006198794 0.0008472465 0.0005630376     0.2    0.8 0.0007970520
2   18 0.0008328118 0.0008445807 0.0007857360     0.8    0.2 0.0001358754
3   19 0.0014102224 0.0008426602 0.0014102224     0.0    1.0 0.0008990069
4   20 0.0008361922 0.0008387691 0.0008129996     0.9    0.1 0.0008202798

[[15]]
  time      pred_f       mod_a       mod_b alpha_w beta_w         val
1   17 0.001556142 0.001556142 0.001357333       1      0 0.001831960
2   18 0.001545818 0.001545818 0.001560010       1      0 0.001643141
3   19 0.001521939 0.001545572 0.001521939       0      1 0.001586889
4   20 0.001546706 0.001546706 0.001767904       1      0 0.001668429

[[16]]
  time       pred_f       mod_a        mod_b alpha_w beta_w          val
1   17 0.0006770833 0.001394837 0.0006770833     0.0    1.0 0.0013167039
2   18 0.0016030774 0.001688887 0.0008307884     0.9    0.1 0.0009321254
3   19 0.0010604730 0.001730484 0.0009860273     0.1    0.9 0.0008096176
4   20 0.0005770483 0.001635698 0.0005770483     0.0    1.0 0.0010109599

[[17]]
  time       pred_f        mod_a       mod_b alpha_w beta_w          val
1   17 0.0013945786 0.0008653706 0.001453379     0.1    0.9 0.0016703868
2   18 0.0020582878 0.0018215213 0.002058288     0.0    1.0 0.0011431227
3   19 0.0008880908 0.0008880908 0.001339176     1.0    0.0 0.0009476834
4   20 0.0018048855 0.0018334588 0.001547726     0.9    0.1 0.0021947340

[[18]]
  time      pred_f       mod_a       mod_b alpha_w beta_w         val
1   17 0.002780547 0.002780547 0.001782293     1.0    0.0 0.003299317
2   18 0.002755109 0.002755109 0.001912485     1.0    0.0 0.001602806
3   19 0.001338368 0.002723073 0.001338368     0.0    1.0 0.002310615
4   20 0.002431167 0.002763754 0.001655130     0.7    0.3 0.001912041

[[19]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0007837318 0.0008617958 0.0007837318       0      1 0.0007625953
2   18 0.0007486862 0.0008586767 0.0007486862       0      1 0.0006761409
3   19 0.0014565198 0.0008600704 0.0014565198       0      1 0.0008144871
4   20 0.0008628539 0.0008628539 0.0004255822       1      0 0.0009530073

[[20]]
  time       pred_f        mod_a        mod_b alpha_w beta_w          val
1   17 0.0013848057 0.0007870111 0.0013848057     0.0    1.0 0.0009485897
2   18 0.0007931042 0.0007846858 0.0008127469     0.7    0.3 0.0003690774
3   19 0.0007907497 0.0007907497 0.0014807308     1.0    0.0 0.0006364187
4   20 0.0007887304 0.0007887304 0.0014257837     1.0    0.0 0.0009045089

plot

Code
all_plot <- list()
for(i in 1:length(vol)){
  weight_p = ""
  for(j in 1:nrow(mix[[i]])){
    weight_p = paste(
      weight_p,as.character(mix[[i]]$time[[j]]),"=","(",
      as.character(mix[[i]]$alpha_w[[j]],1),":",as.character(mix[[i]]$beta_w[[j]],1),")"
      )
    }
  all_plot[[i]] <- ggplot(mix[[i]], aes(x=time)) +
    geom_line(aes(y = val,color = "Real Volatility"))+
    geom_line(aes(y = mod_a,color = "Model a"), linetype="twodash")+
    geom_line(aes(y = mod_b,color = "Model b"), linetype="twodash")+
    geom_line(aes(y = pred_f,color = "Mix Model"), linetype="twodash")+
    scale_color_manual(name = "Model", values = c(
      "Real Volatility" = "red",
      "Model a"="lightblue",
      "Model b"="green",
      "Mix Model" = "blue"))+
    theme_classic()+
    labs(
      title = paste("Prediction Result"),
      tag = as.character(i),
      subtitle = paste("cluster ",as.character(cluster_l[[i]]),
                       if(cluster_l[[i]] == 1|cluster_l[[i]] == 3){mod = "HAV + WLR"}
                       else {mod = "EGARCH + WLR"},
                       "\n weight for each time interval : \n",
                       weight_p
                       ),
      x = "Time interval",
      y = "Volatility",
      caption = "each time interval = 30 seconds"
         )#+
    #transition_reveal(time)
}
Code
all_plot
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]


[[6]]


[[7]]


[[8]]


[[9]]


[[10]]


[[11]]


[[12]]


[[13]]


[[14]]


[[15]]


[[16]]


[[17]]


[[18]]


[[19]]


[[20]]

animation plot

Code
all_plot[[5]]+transition_reveal(time)